{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from keras.layers import Dense, Input, Flatten\n",
    "from keras.layers import GlobalMaxPool1D, Bidirectional, Convolution1D, Embedding, BatchNormalization,MaxPooling1D, Dropout, LSTM\n",
    "from keras import backend as K\n",
    "from keras.engine.topology import Layer\n",
    "from keras import initializers, regularizers, constraints\n",
    "from keras.models import Model\n",
    "from keras.layers.merge import Concatenate\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "INPUT_PATH = '../input/'\n",
    "CACHE_PATH = '../cache/'\n",
    "OUTPUT_PATH ='../output/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = np.load(CACHE_PATH + 'data.npz')\n",
    "X_train = data['X_train']\n",
    "y_train = data['y_train']\n",
    "X_val = data['X_val']\n",
    "y_val = data['y_val']\n",
    "X_test = data['X_test']\n",
    "embedding_matrix = np.load(CACHE_PATH + 'embedding_matrix.npy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "MAX_FEATURES = 20000\n",
    "MAX_SEQUENCE_LENGTH = 300\n",
    "EMBEDDING_DIM = 300"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "embedding_layer = Embedding(MAX_FEATURES,\n",
    "                            EMBEDDING_DIM,\n",
    "                            weights=[embedding_matrix],\n",
    "                            input_length=MAX_SEQUENCE_LENGTH,\n",
    "                            trainable=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "adam 优化器比SGD要好，自适应学习率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_lstm_model():\n",
    "    inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n",
    "    embedded_sequences = embedding_layer(inp)\n",
    "    x = Bidirectional(LSTM(64, dropout=0.25, recurrent_dropout=0.25, return_sequences=True))(embedded_sequences)\n",
    "    x = Bidirectional(LSTM(32, dropout=0.25, recurrent_dropout=0.25, return_sequences=True))(embedded_sequences)\n",
    "    x = Bidirectional(LSTM(32, dropout=0.25, recurrent_dropout=0.25, return_sequences=True))(embedded_sequences)\n",
    "    x = GlobalMaxPool1D()(x)\n",
    "    x = Dropout(0.1)(x)\n",
    "    x = Dense(128, activation=\"relu\")(x)\n",
    "    x = Dropout(0.1)(x)\n",
    "    x = Dense(1, activation=\"linear\")(x)\n",
    "    model = Model(inputs=inp, outputs=x)\n",
    "    model.compile(loss='mse',optimizer='adam')\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def train_lstm_model(model):\n",
    "    model_path = CACHE_PATH + \"lstm_weights_best.hdf5\"\n",
    "    early = EarlyStopping(monitor=\"val_loss\", mode=\"min\", patience=5)\n",
    "    checkpoint = ModelCheckpoint(model_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')\n",
    "    callbacks_list = [checkpoint, early]\n",
    "    model.fit(X_train, y_train, batch_size=128, epochs=100, validation_data=(X_val, y_val), callbacks=callbacks_list)\n",
    "    model.load_weights(model_path)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model = get_lstm_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_1 (InputLayer)         (None, 300)               0         \n",
      "_________________________________________________________________\n",
      "embedding_1 (Embedding)      (None, 300, 300)          6000000   \n",
      "_________________________________________________________________\n",
      "bidirectional_3 (Bidirection (None, 300, 64)           85248     \n",
      "_________________________________________________________________\n",
      "global_max_pooling1d_1 (Glob (None, 64)                0         \n",
      "_________________________________________________________________\n",
      "dropout_1 (Dropout)          (None, 64)                0         \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 128)               8320      \n",
      "_________________________________________________________________\n",
      "dropout_2 (Dropout)          (None, 128)               0         \n",
      "_________________________________________________________________\n",
      "dense_2 (Dense)              (None, 1)                 129       \n",
      "=================================================================\n",
      "Total params: 6,093,697\n",
      "Trainable params: 93,697\n",
      "Non-trainable params: 6,000,000\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.utils.vis_utils import plot_model\n",
    "plot_model(model,to_file='./lstm.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 209000 samples, validate on 11000 samples\n",
      "Epoch 1/100\n",
      "  6144/209000 [..............................] - ETA: 15:31 - loss: 3.7755"
     ]
    }
   ],
   "source": [
    "model = train_lstm_model(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50000/50000 [==============================] - 60s 1ms/step\n"
     ]
    }
   ],
   "source": [
    "y_test = model.predict(X_test,batch_size=128,verbose=1)\n",
    "y_test[y_test < 1] = 1\n",
    "y_test[y_test > 4.7] = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import datetime\n",
    "time = datetime.datetime.now()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sub = pd.read_csv(INPUT_PATH + 'sample.csv',header=None,names=['Id','Score'])\n",
    "sub['Score'] = y_test\n",
    "sub.to_csv(OUTPUT_PATH + 'lstm_{}.csv'.format(time.strftime('%Y-%m-%d-%H:%M:%S')),index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "209000/209000 [==============================] - 252s 1ms/step\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "Length of values does not match length of index",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-16-a376bae75e89>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0my_train\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0my_train\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m4.7\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mdf_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcolumnsname\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     12\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCACHE_PATH\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'train_pred_model.csv'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m   2329\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2330\u001b[0m             \u001b[0;31m# set column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2331\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2332\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2333\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_setitem_slice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_set_item\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m   2395\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2396\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ensure_valid_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2397\u001b[0;31m         \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sanitize_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2398\u001b[0m         \u001b[0mNDFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2399\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_sanitize_column\u001b[0;34m(self, key, value, broadcast)\u001b[0m\n\u001b[1;32m   2566\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2567\u001b[0m             \u001b[0;31m# turn me into an ndarray\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2568\u001b[0;31m             \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_sanitize_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2569\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2570\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m_sanitize_index\u001b[0;34m(data, index, copy)\u001b[0m\n\u001b[1;32m   2877\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2878\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2879\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Length of values does not match length of '\u001b[0m \u001b[0;34m'index'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2880\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2881\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPeriodIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Length of values does not match length of index"
     ]
    }
   ],
   "source": [
    "df_predict = pd.read_csv(CACHE_PATH+'test_pred_model.csv')\n",
    "columnsname = time.strftime(\"%d-%H-%M\")\n",
    "df_predict[columnsname] = y_test\n",
    "df_predict.to_csv(CACHE_PATH+'test_pred_model.csv',index=False)\n",
    "\n",
    "df_train = pd.read_csv(CACHE_PATH + 'train_pred_model.csv')\n",
    "df_train = df_train[df_train['Score'].notnull()].reset_index(drop=True)\n",
    "y_train = model.predict(X_train,batch_size=128,verbose=1)\n",
    "y_train[y_train < 1] = 1\n",
    "y_train[y_train > 4.7] = 5\n",
    "df_train[columnsname] = y_train\n",
    "df_train.to_csv(CACHE_PATH+'train_pred_model.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
